Introduction to Open Data Science - Course Project

About the project

Write a short description about the course and add a link to your GitHub repository here. This is an R Markdown (.Rmd) file so you should use R Markdown syntax.

Test text.

# This is a so-called "R chunk" where you can write R code.

# Report the date/time at which this chunk was run (the knit timestamp).
date()
## [1] "Fri Nov 13 14:39:18 2020"

The text continues here.


Chapter 2: Regression and model validation

Describe the work you have done this week and summarize your learning.

lrn2014 <- read.csv("C:/Users/Juhana/Documents/IODS-project-1/data/learning2014.csv", row.names = 1)

alcGit <- read.csv("https://github.com/JunzQ7/IODS-project/tree/master/data/alc.csv")
# ggplot2 for plotting; GGally provides ggpairs() for pairwise plots.
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Matrix of pairwise plots for all learning2014 variables; histograms in
# the combo panels use 20 bins.
p <- ggpairs(lrn2014, mapping = aes(), lower = list(combo = wrap("facethist", bins = 20)))
p

# Multiple regression: exam Points explained by Attitude and the
# strategic (stra) and surface (surf) learning scores.
# NOTE(review): `=` is used for assignment here; the rest of the file
# uses `<-`.
model = lm(Points~Attitude+stra+surf, data = lrn2014)
summary(model)
## 
## Call:
## lm(formula = Points ~ Attitude + stra + surf, data = lrn2014)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.1550  -3.4346   0.5156   3.6401  10.8952 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 11.01711    3.68375   2.991  0.00322 ** 
## Attitude     0.33952    0.05741   5.913 1.93e-08 ***
## stra         0.85313    0.54159   1.575  0.11716    
## surf        -0.58607    0.80138  -0.731  0.46563    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.296 on 162 degrees of freedom
## Multiple R-squared:  0.2074, Adjusted R-squared:  0.1927 
## F-statistic: 14.13 on 3 and 162 DF,  p-value: 3.156e-08
# Reduced model: stra and surf are not significant above, so keep only
# Attitude as the explanatory variable.
model2 = lm(Points~Attitude, data = lrn2014)
summary(model2)
## 
## Call:
## lm(formula = Points ~ Attitude, data = lrn2014)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.9763  -3.2119   0.4339   4.1534  10.6645 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 11.63715    1.83035   6.358 1.95e-09 ***
## Attitude     0.35255    0.05674   6.214 4.12e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.32 on 164 degrees of freedom
## Multiple R-squared:  0.1906, Adjusted R-squared:  0.1856 
## F-statistic: 38.61 on 1 and 164 DF,  p-value: 4.119e-09
# Diagnostic plots: 1 = Residuals vs Fitted, 2 = Normal Q-Q,
# 5 = Residuals vs Leverage.
plot(model2, which = c(1,2,5))


alc <- read.csv("data/alc.csv", row.names = 1)

Variables selected as candidate predictors of high alcohol use: sex, absences, internet, activities.

library(ggplot2)
# Box plot of school absences split by high/low alcohol use.
g1 <- ggplot(alc, aes(x = high_use, y = absences))
g1 + geom_boxplot()

# Proportion of high alcohol use within each sex (margin = 1 gives
# row-wise proportions).
(prop.table(table(alc$sex, alc$high_use), margin = 1))
##    
##         FALSE      TRUE
##   F 0.7878788 0.2121212
##   M 0.6086957 0.3913043
# Logistic regression: model high alcohol use with the four chosen
# predictors (absences, sex, internet access, extra activities).
m <- glm(high_use ~ absences + sex + internet + activities, data = alc, family = "binomial")
summary(m)
## 
## Call:
## glm(formula = high_use ~ absences + sex + internet + activities, 
##     family = "binomial", data = alc)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3480  -0.8517  -0.6163   1.1187   2.0931  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -1.68510    0.35648  -4.727 2.28e-06 ***
## absences       0.09672    0.02315   4.178 2.95e-05 ***
## sexM           1.02724    0.24411   4.208 2.57e-05 ***
## internetyes    0.02369    0.33998   0.070    0.944    
## activitiesyes -0.38679    0.23891  -1.619    0.105    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 465.68  on 381  degrees of freedom
## Residual deviance: 427.43  on 377  degrees of freedom
## AIC: 437.43
## 
## Number of Fisher Scoring iterations: 4
# Model coefficients on the log-odds scale.
coef(m)
##   (Intercept)      absences          sexM   internetyes activitiesyes 
##   -1.68509737    0.09672074    1.02724315    0.02369222   -0.38678774
# Fitted probabilities of high_use for each observation.
probabilities <- predict(m, type = "response")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# add the predicted probabilities to 'alc'
alc <- mutate(alc, probability = probabilities)

# use the probabilities to make a prediction of high_use
# (classify as high use when the predicted probability exceeds 0.5)
alc <- mutate(alc, prediction = probability > 0.5)


# tabulate the target variable versus the predictions
table(high_use = alc$high_use, prediction = alc$prediction)
##         prediction
## high_use FALSE TRUE
##    FALSE   257   11
##    TRUE     88   26
# access dplyr and ggplot2
library(dplyr); library(ggplot2)

# initialize a plot of 'high_use' versus 'probability' in 'alc'
g <- ggplot(alc, aes(x = probability, y = high_use, col = prediction))

# define the geom as points and draw the plot
g + geom_point()

# tabulate the target variable versus the predictions
# (as proportions of all observations, with row/column sums appended)
table(high_use = alc$high_use, prediction = alc$prediction) %>% prop.table %>% addmargins
##         prediction
## high_use      FALSE       TRUE        Sum
##    FALSE 0.67277487 0.02879581 0.70157068
##    TRUE  0.23036649 0.06806283 0.29842932
##    Sum   0.90314136 0.09685864 1.00000000

# MASS provides the Boston housing data and the lda() function.
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
data(Boston)
str(Boston)
## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
# Standardize every column (mean 0, sd 1); scale() returns a matrix.
boston_scaled = scale(Boston)
str(boston_scaled)
##  num [1:506, 1:14] -0.419 -0.417 -0.417 -0.416 -0.412 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:506] "1" "2" "3" "4" ...
##   ..$ : chr [1:14] "crim" "zn" "indus" "chas" ...
##  - attr(*, "scaled:center")= Named num [1:14] 3.6135 11.3636 11.1368 0.0692 0.5547 ...
##   ..- attr(*, "names")= chr [1:14] "crim" "zn" "indus" "chas" ...
##  - attr(*, "scaled:scale")= Named num [1:14] 8.602 23.322 6.86 0.254 0.116 ...
##   ..- attr(*, "names")= chr [1:14] "crim" "zn" "indus" "chas" ...
boston_scaled <- as.data.frame(boston_scaled)
# Quartile break points of the scaled crime rate; used to bin crim into
# a four-level categorical variable.
bins <- quantile(boston_scaled$crim)
bins
##           0%          25%          50%          75%         100% 
## -0.419366929 -0.410563278 -0.390280295  0.007389247  9.924109610
crime <- cut(boston_scaled$crim, breaks = bins, include.lowest = TRUE, labels = c("low", "med_low", "med_high", "high"))
# Replace the continuous crim column with the categorical crime factor.
boston_scaled <- dplyr::select(boston_scaled, -crim)
boston_scaled <- data.frame(boston_scaled, crime)

n <- nrow(boston_scaled)
# choose randomly 80% of the rows
# NOTE(review): no set.seed() before sample(), so this split is not
# reproducible across knits — consider seeding here.
ind <- sample(n,  size = n * 0.8)

# create train set
train <- boston_scaled[ind,]

# create test set 
test <- boston_scaled[-ind,]

# save the correct classes from test data
correct_classes <- test$crime

# remove the crime variable from test data
# (so the test set can be used for "blind" prediction below)
test <- dplyr::select(test, -crime)

# LDA
# Linear discriminant analysis: crime class as the target, all other
# (scaled) variables as predictors.
lda.fit <- lda(crime ~ ., data = train)

# print the lda.fit object
lda.fit
## Call:
## lda(crime ~ ., data = train)
## 
## Prior probabilities of groups:
##       low   med_low  med_high      high 
## 0.2648515 0.2500000 0.2524752 0.2326733 
## 
## Group means:
##                   zn      indus        chas        nox          rm        age
## low       0.97158426 -0.9599453 -0.12514775 -0.9020608  0.43687291 -0.8884058
## med_low  -0.06653482 -0.3089421 -0.03844192 -0.5644652 -0.08142222 -0.3802503
## med_high -0.36785679  0.1959520  0.22945822  0.4353902  0.13514308  0.4274404
## high     -0.48724019  1.0172896 -0.14667693  1.0170909 -0.37162929  0.8103798
##                 dis        rad        tax     ptratio       black       lstat
## low       0.8957949 -0.6920711 -0.7123074 -0.48237641  0.37701997 -0.78089619
## med_low   0.3453090 -0.5418150 -0.4504077 -0.07595779  0.32825882 -0.18599640
## med_high -0.3856212 -0.4020080 -0.3005790 -0.36121195  0.07567687  0.06023923
## high     -0.8461426  1.6363892  1.5128120  0.77875205 -0.80524897  0.79632665
##                 medv
## low       0.49069664
## med_low   0.03314276
## med_high  0.17956820
## high     -0.63315929
## 
## Coefficients of linear discriminants:
##                 LD1         LD2         LD3
## zn       0.10370719  0.64781155 -0.93131878
## indus    0.07193844 -0.52660054  0.34914942
## chas    -0.08806524 -0.09837703  0.04445874
## nox      0.41691658 -0.68410797 -1.27575067
## rm      -0.08873134 -0.13626175 -0.16449397
## age      0.23746166 -0.23530082 -0.25111561
## dis     -0.03598091 -0.30312595  0.02921287
## rad      3.19201223  0.74005044 -0.13677099
## tax     -0.04637429  0.33491159  0.54341675
## ptratio  0.13937557  0.02441220 -0.21080271
## black   -0.16510102  0.02525885  0.15853548
## lstat    0.26555334 -0.37318313  0.32018141
## medv     0.23700719 -0.48666180 -0.13153044
## 
## Proportion of trace:
##    LD1    LD2    LD3 
## 0.9433 0.0452 0.0115
# Draw biplot arrows for an LDA fit on the currently active plot.
# x: a fitted lda object; myscale scales arrow length; arrow_heads sets
# the arrow-head size; tex is the label size; choices selects which two
# discriminants form the plot axes.
lda.arrows <- function(x, myscale = 1, arrow_heads = 0.1, color = "orange", tex = 0.75, choices = c(1,2)){
  coefs <- coef(x)
  x_tips <- myscale * coefs[, choices[1]]
  y_tips <- myscale * coefs[, choices[2]]
  # one arrow per original variable, from the origin to its loading
  arrows(x0 = 0, y0 = 0, x1 = x_tips, y1 = y_tips,
         col = color, length = arrow_heads)
  # label each arrow tip with the corresponding variable name
  text(myscale * coefs[, choices], labels = row.names(coefs),
       cex = tex, col = color, pos = 3)
}

# target classes as numeric
classes <- as.numeric(train$crime)

# plot the lda results
# (first two discriminants; color and symbol encode the crime class)
plot(lda.fit, dimen = 2, col = classes, pch = classes)
lda.arrows(lda.fit, myscale = 1)

# predict classes with test data
lda.pred <- predict(lda.fit, newdata = test)

# cross tabulate the results
# (rows: true classes saved earlier; columns: predicted classes)
table(correct = correct_classes, predicted = lda.pred$class)
##           predicted
## correct    low med_low med_high high
##   low        9       8        3    0
##   med_low    1      21        3    0
##   med_high   0      12       11    1
##   high       0       0        0   33
# load MASS and Boston
library(MASS)
library(ggplot2)
data('Boston')

# Standardize the variables before computing pairwise distances.
Boston <- scale(Boston)

# euclidean distance matrix
dist_eu <- dist(Boston)

# look at the summary of the distances
summary(dist_eu)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1343  3.4625  4.8241  4.9111  6.1863 14.3970
# manhattan distance matrix
dist_man <- dist(Boston, method = 'manhattan')

# look at the summary of the distances
summary(dist_man)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.2662  8.4832 12.6090 13.5488 17.7568 48.8618
# k-means clustering
# NOTE(review): this runs before the set.seed() below, so this 3-cluster
# solution on its own is not reproducible.
km <-kmeans(Boston, centers = 3)

# plot the Boston dataset with clusters
pairs(Boston, col = km$cluster)

set.seed(123)

# determine the number of clusters
k_max <- 10

# calculate the total within sum of squares
# (refit k-means for k = 1..k_max; the "elbow" of this curve suggests a
# reasonable number of clusters)
twcss <- sapply(1:k_max, function(k){kmeans(Boston, k)$tot.withinss})

# visualize the results
# NOTE(review): qplot() is deprecated in recent ggplot2 versions;
# ggplot() + geom_line() is the current equivalent.
qplot(x = 1:k_max, y = twcss, geom = 'line')

# Reload and standardize the Boston data, run 5-cluster k-means, and
# visualize the clusters with LDA using the cluster label as the target.
data(Boston)
Boston <- as.data.frame(scale(Boston))
# Seed the RNG so the stochastic k-means result — and therefore the
# whole LDA plot below — is reproducible on re-knit (same seed value as
# used earlier in this report).
set.seed(123)
km <- kmeans(Boston, centers = 5)
# Attach the cluster assignment as a column to serve as the LDA target.
Boston$km <- km$cluster
lda.fit <- lda(km ~ ., data = Boston)
classes <- as.numeric(Boston$km)
# First two discriminants, colored/symbol-coded by cluster, with biplot
# arrows for the original variables.
plot(lda.fit, dimen = 2, col = classes, pch = classes)
lda.arrows(lda.fit, myscale = 1)

# Re-create the scaled Boston data with the categorical crime target and
# an 80/20 train/test split (same steps as earlier), then refit the LDA.
# NOTE(review): sample() is again unseeded immediately before this call,
# so the split differs from the earlier one.
library(MASS)
data(Boston)
boston_scaled = scale(Boston)
boston_scaled <- as.data.frame(boston_scaled)
bins <- quantile(boston_scaled$crim)
crime <- cut(boston_scaled$crim, breaks = bins, include.lowest = TRUE, labels = c("low", "med_low", "med_high", "high"))
boston_scaled <- dplyr::select(boston_scaled, -crim)
boston_scaled <- data.frame(boston_scaled, crime)
n <- nrow(boston_scaled)
ind <- sample(n,  size = n * 0.8)
train <- boston_scaled[ind,]
test <- boston_scaled[-ind,]
correct_classes <- test$crime
test <- dplyr::select(test, -crime)

# LDA
lda.fit <- lda(crime ~ ., data = train)
# Predictor matrix: the training data without the target column, used to
# project the observations onto the discriminant axes.
model_predictors <- dplyr::select(train, -crime)
# check the dimensions
dim(model_predictors)
## [1] 404  13
dim(lda.fit$scaling)
## [1] 13  3
# matrix multiplication
# (projects each of the 404 observations onto the 3 linear discriminants)
matrix_product <- as.matrix(model_predictors) %*% lda.fit$scaling
matrix_product <- as.data.frame(matrix_product)
# plotly for an interactive 3D scatter plot.
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# 3D scatter of the training observations in discriminant (LD1-LD3)
# space, colored by their crime class.
plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type= 'scatter3d', mode='markers', color = as.numeric(train$crime))
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.